#define DEBUG_TYPE "gcn-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
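// Defining GET_SUBTARGETINFO_TARGET_DESC and GET_SUBTARGETINFO_CTOR before the
// #include selects the TableGen-generated feature tables and constructor from
// AMDGPUGenSubtargetInfo.inc; the AMDGPUSubtarget -> GCNSubtarget #define
// points the generated code at this subclass.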
// These flags feed the subtarget queries useVGPRIndexMode(), useAA() and
// getNSAThreshold() below, and can be set on the llc command line, e.g.
//   llc -mtriple=amdgcn -mcpu=gfx1030 -amdgpu-nsa-threshold=4 kernel.ll
static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

static cl::opt<unsigned>
    NSAThreshold("amdgpu-nsa-threshold",
                 cl::desc("Number of addresses from which to enable MIMG NSA."),
                 cl::init(2), cl::Hidden);
  FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a mattr.
  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  } else if (!hasFeature(AMDGPU::FeatureWavefrontSize32) &&
             !hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    // If no default wave size is set, pick wave32; older generations carry
    // FeatureWavefrontSize64 in their definitions already.
    ToggleFeature(AMDGPU::FeatureWavefrontSize32);
  }
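  // Example: FS = "+wavefrontsize32" leaves the request intact but appends
  // "-wavefrontsize16,-wavefrontsize64," to FullFS, so exactly one wave size
  // survives feature parsing.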
  // Unless {+,-}flat-for-global was requested explicitly, enable FlatForGlobal
  // on targets without ADDR64 MUBUF variants, and disable it on targets
  // without flat instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }
  // In WGP (workgroup processor) mode on gfx10+, a workgroup can use the LDS
  // of both compute units, doubling the usable local memory size.
  if (AMDGPU::isGFX10Plus(*this) &&
      !getFeatureBits().test(AMDGPU::FeatureCuMode))
    LocalMemorySize *= 2;
void GCNSubtarget::checkSubtargetFeatures(const Function &F) const {
  LLVMContext &Ctx = F.getContext();
  if (hasFeature(AMDGPU::FeatureWavefrontSize32) &&
      hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    // E.g. "target-features"="+wavefrontsize32,+wavefrontsize64" sets both
    // features and is rejected here.
    Ctx.diagnose(DiagnosticInfoUnsupported(
        F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
  }
}
      InstrItins(getInstrItineraryForCPU(GPU)),
      InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
  TSInfo = std::make_unique<AMDGPUSelectionDAGInfo>();

  CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering());
  InlineAsmLoweringInfo =
      std::make_unique<InlineAsmLowering>(getTargetLowering());
  Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM);
  RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this);
  InstSelector =
      std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM);
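  // The subtarget owns the GlobalISel pieces built above (call lowering,
  // inline asm lowering, legalizer, register bank info, instruction selector);
  // passes reach them through the overridden accessors such as
  // getLegalizerInfo() and getRegBankInfo() rather than building their own.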
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHLREV_B64_e64_gfx11:
  case AMDGPU::V_LSHLREV_B64_e32_gfx12:
  case AMDGPU::V_LSHLREV_B64_e64_gfx12:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHRREV_B64_e64_gfx11:
  case AMDGPU::V_LSHRREV_B64_e64_gfx12:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHRREV_I64_e64_gfx11:
  case AMDGPU::V_ASHRREV_I64_e64_gfx12:
  case AMDGPU::V_ASHR_I64_e64:
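    // Assumed completion from getConstantBusLimit(): gfx10+ generally allows
    // two scalar operands per VALU instruction, but the 64-bit shifts listed
    // above remain limited to a single SGPR or literal operand.
    return 1;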
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:

  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
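// The case labels above belong to zeroesHigh16BitsOfDest(Opcode), which
// reports whether an instruction writing a 16-bit result to a 32-bit register
// implicitly zeroes the high 16 bits; the answer is generation-dependent
// (gfx10 preserves the high bits that earlier targets wrote as zeros).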
void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  Policy.ShouldTrackPressure = true;
  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
  if (isWave32()) {
    for (auto &MBB : MF) {
      for (auto &MI : MBB)
        InstrInfo.fixImplicitOperands(MI);
    }
  }
}
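// fixImplicitOperands() rewrites wave-size-dependent implicit operands, e.g.
// an implicit $vcc on a parsed compare or cndmask becomes $vcc_lo under
// wave32, so MIR read back from disk matches what codegen would produce.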
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
  return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
}
unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs implied by the requested minimum
  // number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check whether a maximum was explicitly requested via the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested =
        F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);

    // Ignore requests that cannot even cover the reserved registers.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // Never request fewer SGPRs than the preloaded user/system inputs need.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the request is compatible with the waves-per-EU bounds.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second && Requested &&
        Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}
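// Worked example: with "amdgpu-num-sgpr"="96", 48 preloaded input SGPRs and
// ReservedNumSGPRs == 6, the request clears every check (assuming the
// waves-per-EU bounds allow it) and the result is
// min(96 - 6, MaxAddressableNumSGPRs).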
static unsigned getMaxNumPreloadedSGPRs() {
  using USI = GCNUserSGPRUsageInfo;
  // Max number of user SGPRs.
  const unsigned MaxUserSGPRs =
      USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
      USI::getNumUserSGPRForField(USI::DispatchPtrID) +
      USI::getNumUserSGPRForField(USI::QueuePtrID) +
      USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
      USI::getNumUserSGPRForField(USI::DispatchIdID) +
      USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
      USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);

  // Max number of system SGPRs: workgroup IDs X/Y/Z, workgroup info, and the
  // private segment wave byte offset.
  const unsigned MaxSystemSGPRs = 1 + 1 + 1 + 1 + 1;

  // One synthetic SGPR for the LDS kernel id.
  const unsigned SyntheticSGPRs = 1;

  return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
}
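// Assuming the usual field widths (4 SGPRs for the private segment buffer
// resource descriptor, 2 for each 64-bit pointer or flat scratch init field),
// the total is 16 user + 5 system + 1 synthetic = 22 preloadable SGPRs.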
unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute the maximum number of VGPRs implied by the requested minimum
  // number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Honor an explicit "amdgpu-num-vgpr" request if it is compatible with the
  // waves-per-EU bounds.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested =
        F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);

    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second && Requested &&
        Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}
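// The VGPR path mirrors the SGPR clamping above: an "amdgpu-num-vgpr"="64"
// request is honored only while 64 stays inside the limits implied by the
// function's waves-per-EU range.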
  if (DefI->isBundle()) {
    // The def is a bundle header: take the latency of the bundled instruction
    // that actually writes Reg, discounting instructions issued after it.
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    // The use is a bundle header: shrink the def's latency by the position of
    // the first bundled instruction that reads Reg.
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Re-derive the latency for implicit VCC_LO operands, which the generic
    // code models with zero latency.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
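  // Scheduling treats a bundle as a single unit, so both walks above re-derive
  // operand latency from the real instructions inside the bundle rather than
  // from the bundle header itself.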
  // Inside getNSAThreshold(MF): a per-function attribute overrides the global
  // -amdgpu-nsa-threshold default.
  int Value = MF.getFunction().getFnAttributeAsParsedInteger(
      "amdgpu-nsa-threshold", -1);
  if (Value > 0)
    return std::max(Value, 2);
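  // Thresholds below 2 are meaningless for the NSA (non-sequential address)
  // MIMG encoding, which only matters once an image instruction carries at
  // least two address registers, hence the clamp. Hypothetical IR override:
  //   attributes #0 = { "amdgpu-nsa-threshold"="4" }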
  const CallingConv::ID CC = F.getCallingConv();
  const bool IsKernel =
      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;

  // Calls and stack objects are detected via attributes placed by earlier
  // passes rather than by analysis here.
  const bool HasCalls = F.hasFnAttribute("amdgpu-calls");

  const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");

  if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  if (!AMDGPU::isGraphics(CC)) {
    if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
      DispatchPtr = true;

    if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
      QueuePtr = true;

    if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
      DispatchID = true;
  }

  // Flat scratch init is only needed for entry functions that may spill or
  // make calls, and only where scratch is not architected.
  if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
      (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
      (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
      !ST.flatScratchIsArchitected()) {
    FlatScratchInit = true;
  }
void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
  // Kernarg preload SGPRs are carved out of the user SGPR budget.
  NumKernargPreloadSGPRs += NumSGPRs;
  NumUsedUserSGPRs += NumSGPRs;
}
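// Each has*()/alloc*() query above feeds kernel prologue emission: the fields
// marked here determine which user SGPRs are set up for the kernel, and
// getNumFreeUserSGPRs() reports how many remain for kernarg preloading.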