Extend kernel-info to emit PGO-based FLOP count #110586

Draft · wants to merge 184 commits into main

Commits (184):
530eb98
Add profiling functions to libomptarget
EthanLuisMcDonough Dec 16, 2023
fb067d4
Fix PGO instrumentation for GPU targets
EthanLuisMcDonough Dec 16, 2023
7a0e0ef
Change global visibility on GPU targets
EthanLuisMcDonough Dec 21, 2023
fddc079
Make names global public on GPU
EthanLuisMcDonough Dec 23, 2023
e9db03c
Read and print GPU device PGO globals
EthanLuisMcDonough Dec 29, 2023
aa83bd2
Merge branch 'main' into gpuprof
EthanLuisMcDonough Dec 29, 2023
e468760
Fix rebase bug
EthanLuisMcDonough Jan 3, 2024
ec18ce9
Refactor portions to be more idiomatic
EthanLuisMcDonough Jan 3, 2024
0872556
Reformat DeviceRTL prof functions
EthanLuisMcDonough Jan 3, 2024
94f47f3
Merge branch 'main' into gpuprof
EthanLuisMcDonough Jan 3, 2024
62f31d1
Style changes + catch name error
EthanLuisMcDonough Jan 9, 2024
0c4bbeb
Add GPU PGO test
EthanLuisMcDonough Jan 18, 2024
c7ae2a7
Fix PGO test formatting
EthanLuisMcDonough Jan 18, 2024
9e66bfb
Merge branch 'main' into gpuprof
EthanLuisMcDonough Jan 19, 2024
8bb2207
Refactor visibility logic
EthanLuisMcDonough Jan 19, 2024
9f13943
Add LLVM instrumentation support
EthanLuisMcDonough Jan 24, 2024
b28d4a9
Merge branch 'main' into gpuprof
EthanLuisMcDonough Jan 24, 2024
23d7fe2
Merge branch 'main' into gpuprof
EthanLuisMcDonough Feb 14, 2024
0606f0d
Use explicit addrspace instead of unqual
EthanLuisMcDonough Feb 14, 2024
23f75b2
Merge branch 'main' into gpuprof
EthanLuisMcDonough Feb 15, 2024
c1f9be3
Remove redundant namespaces
EthanLuisMcDonough Feb 16, 2024
721dac6
Merge branch 'main' into gpuprof
EthanLuisMcDonough Feb 16, 2024
6a3ae40
Clang format
EthanLuisMcDonough Feb 16, 2024
6866862
Use getAddrSpaceCast
EthanLuisMcDonough Feb 16, 2024
62a5ee1
Revert "Use getAddrSpaceCast"
EthanLuisMcDonough Feb 27, 2024
052394f
Revert "Use getAddrSpaceCast"
EthanLuisMcDonough Feb 27, 2024
612d5a5
Write PGO
EthanLuisMcDonough Mar 1, 2024
b8c9163
Fix tests
EthanLuisMcDonough Mar 14, 2024
e572452
Merge branch 'main' into gpuprofdriver
EthanLuisMcDonough Mar 14, 2024
4568c42
Fix arguments
EthanLuisMcDonough Mar 14, 2024
d86b101
Merge branch 'main' into gpuprofdriver
EthanLuisMcDonough Mar 19, 2024
1fc4cb9
Add GPU prof flags
EthanLuisMcDonough Mar 19, 2024
849b244
Fix elf obj file
EthanLuisMcDonough Mar 19, 2024
55bd8d2
Add GPU use profile option
EthanLuisMcDonough Mar 19, 2024
7231080
Merge branch 'main' into gpuprofdriver
EthanLuisMcDonough Apr 6, 2024
4ebbb45
Add more addrspace casts for GPU targets
EthanLuisMcDonough May 7, 2024
4be80e5
Merge branch 'main' into gpuprof
EthanLuisMcDonough May 7, 2024
b2fe222
Merge branch 'main' into gpuprofwrite
EthanLuisMcDonough May 7, 2024
7770b37
Fix params
EthanLuisMcDonough May 7, 2024
702d170
Merge branch 'main' into gpuprofdriver
EthanLuisMcDonough May 7, 2024
619fb69
Resolve merge conflict
EthanLuisMcDonough May 7, 2024
f6a1545
Merge branch 'main' into gpuprof
EthanLuisMcDonough May 9, 2024
92260d8
Merge branch 'main' into gpuprofwrite
EthanLuisMcDonough May 9, 2024
58491a7
Merge branch 'main' into gpuprofdriver
EthanLuisMcDonough May 9, 2024
6267c2a
Merge branch 'main' into gpuprofdriver
EthanLuisMcDonough May 11, 2024
3f08ae9
Have test read from profraw instead of dump
EthanLuisMcDonough May 11, 2024
09f2b39
Remove debug dump
EthanLuisMcDonough May 11, 2024
1dbde8e
Merge branch 'main' into gpuprof
EthanLuisMcDonough May 13, 2024
1278989
Merge branch 'main' into gpuprofwrite
EthanLuisMcDonough May 13, 2024
ff8f233
Merge branch 'main' into gpuprofdriver
EthanLuisMcDonough May 13, 2024
ed2a289
Merge branch 'main' into gpuprof_ptrcastfix
EthanLuisMcDonough May 13, 2024
aa895a1
Fix elf obj file
EthanLuisMcDonough Mar 19, 2024
2031e49
Add more addrspace casts for GPU targets
EthanLuisMcDonough May 7, 2024
5de6082
Merge branch 'gpuprof_ptrcastfix' into gpuprofwrite
EthanLuisMcDonough May 13, 2024
3e43a18
Merge branch 'gpuprof_ptrcastfix' into gpuprofdriver
EthanLuisMcDonough May 13, 2024
be6524b
Have test read from profraw instead of dump
EthanLuisMcDonough May 13, 2024
000deed
Merge branch 'gpuprofwrite' into gpuprofdriver
EthanLuisMcDonough May 13, 2024
e266cc7
Fix GPU PGO names
EthanLuisMcDonough May 17, 2024
2ba27e8
Merge branch 'main' into gpuprof
EthanLuisMcDonough May 21, 2024
c754f7f
Merge branch 'main' into gpuprofwrite
EthanLuisMcDonough May 24, 2024
2b8eb29
Fix PGO test format
EthanLuisMcDonough May 25, 2024
67f3009
Refactor profile writer
EthanLuisMcDonough May 25, 2024
1cec247
Merge branch 'main' into gpuprofdriver
EthanLuisMcDonough May 25, 2024
cee07bc
Merge branch 'main' into gpuprofwrite
EthanLuisMcDonough May 27, 2024
e8ad132
Fix refactor bug
EthanLuisMcDonough May 27, 2024
9e23b08
Merge branch 'main' into gpuprofdriver
EthanLuisMcDonough May 28, 2024
1e8fafc
Merge branch 'gpuprofwrite' into gpuprofdriver
EthanLuisMcDonough May 28, 2024
79bf08e
Check for level in test case
EthanLuisMcDonough May 28, 2024
4c9f814
Make requested clang-format change
EthanLuisMcDonough May 28, 2024
e187f5a
Merge branch 'gpuprofwrite' into gpuprofdriver
EthanLuisMcDonough May 28, 2024
cfe1660
Check for version global on GPU
EthanLuisMcDonough May 30, 2024
5bf4376
Add host/device combination test
EthanLuisMcDonough May 31, 2024
2530137
Add PGO dump debug option
EthanLuisMcDonough May 31, 2024
9cddcf4
Merge branch 'main' into gpuprof
EthanLuisMcDonough Jun 1, 2024
53d6309
Merge branch 'main' into gpuprofwrite
EthanLuisMcDonough Jun 1, 2024
f9138fb
Merge branch 'main' into gpuprofdriver
EthanLuisMcDonough Jun 1, 2024
344e357
Tighten PGO test requirements
EthanLuisMcDonough Jun 1, 2024
2f75142
Tighten PGO test requirements
EthanLuisMcDonough Jun 1, 2024
79ceacb
Tighten PGO test requirements
EthanLuisMcDonough Jun 1, 2024
ff0dd62
Add note about PGO debug flag
EthanLuisMcDonough Jun 1, 2024
0b9cc35
Fix clang format
EthanLuisMcDonough Jun 4, 2024
bf5dbd6
Merge branch 'main' into gpuprofdriver
EthanLuisMcDonough Jun 23, 2024
488cb4a
Apply requested formatting changes
EthanLuisMcDonough Jun 26, 2024
b90c015
Add memop function shim to DeviceRTL
EthanLuisMcDonough Jun 26, 2024
dc90a5c
Merge branch 'gpuprof' into gpuprofwrite
EthanLuisMcDonough Jun 27, 2024
c68c6e2
Make requested changes
EthanLuisMcDonough Jun 27, 2024
ca52c58
Only dump counters if PGODump flag is set
EthanLuisMcDonough Jun 27, 2024
0da7627
Merge branch 'main' into gpuprofwrite
EthanLuisMcDonough Aug 10, 2024
ee4431a
Update requirements
EthanLuisMcDonough Aug 10, 2024
90a6e30
Merge branch 'main' into gpuprofdriver
EthanLuisMcDonough Aug 10, 2024
f9a24e3
Update test requirements
EthanLuisMcDonough Aug 10, 2024
efe70ad
Merge branch 'main' into gpuprofwrite
EthanLuisMcDonough Aug 10, 2024
fb699b6
Merge changes
EthanLuisMcDonough Aug 10, 2024
6eb137e
Merge branch 'main' into gpuprofdriver
EthanLuisMcDonough Aug 10, 2024
5a671f6
[KernelInfo] Implement new LLVM IR pass for GPU code analysis
jdenny-ornl Aug 12, 2024
a7656de
Move docs to KernelInfo.rst
jdenny-ornl Aug 12, 2024
d92856e
Move conditional outside registration call
jdenny-ornl Aug 12, 2024
5727284
Merge changes
EthanLuisMcDonough Aug 12, 2024
6ac3f41
Use llvm::SmallString
jdenny-ornl Aug 12, 2024
6367ad7
Use TTI.getFlatAddressSpace for addrspace(0)
jdenny-ornl Aug 12, 2024
78446bb
Avoid repetition between amdgpu and nvptx tests
jdenny-ornl Aug 12, 2024
fede524
Use named values in tests
jdenny-ornl Aug 12, 2024
4c30b8a
Say flat address space instead of addrspace(0)
jdenny-ornl Aug 13, 2024
33f0d4d
Cache the flat address space
jdenny-ornl Aug 13, 2024
a2a512c
Link KernelInfo.rst from Passes.rst
jdenny-ornl Aug 13, 2024
de04ac4
Don't filter out cpus
jdenny-ornl Aug 13, 2024
ec5d2bd
Include less in header
jdenny-ornl Aug 16, 2024
c06b905
Removed unused comparison operators
jdenny-ornl Aug 16, 2024
d83d22a
Remove redundant null check
jdenny-ornl Aug 16, 2024
1649cf8
Move KernelInfo to KernelInfo.cpp, remove KernelInfoAnalysis
jdenny-ornl Aug 16, 2024
1a3c0ae
Use printAsOperand not getName to identify instruction
jdenny-ornl Aug 16, 2024
ea89a81
Use printAsOperand to report indirect callee
jdenny-ornl Aug 16, 2024
8da602b
Report inline assembly calls
jdenny-ornl Aug 16, 2024
45114fd
Use llvm::SmallString
jdenny-ornl Aug 16, 2024
eea139c
Use llvm::SmallString
jdenny-ornl Aug 16, 2024
8bf6e4e
getKernelInfo -> emitKernelInfo because return is unused
jdenny-ornl Aug 16, 2024
d2ee05d
Merge branch 'main' into kernel-info-pr
jdenny-ornl Aug 21, 2024
10e6c48
Merge branch 'main' into gpuprofwrite
EthanLuisMcDonough Sep 4, 2024
9b865f4
Merge branch 'main' into kernel-info-pr
jdenny-ornl Sep 5, 2024
39979f7
Merge branch 'main' into kernel-info-pr
jdenny-ornl Sep 12, 2024
62d494d
Clean up launch bounds
jdenny-ornl Sep 13, 2024
e4d3fca
Merge branch 'main' into kernel-info-pr
jdenny-ornl Sep 16, 2024
94d90d1
Adjust forEachLaunchBound param
jdenny-ornl Sep 16, 2024
762a217
Reuse Function::getFnAttributeAsParsedInteger
jdenny-ornl Sep 16, 2024
df66a3d
Move forEachLaunchBound to TargetTransformInfo
jdenny-ornl Sep 16, 2024
5488764
Merge branch 'main' into kernel-info-pr
jdenny-ornl Sep 26, 2024
3f63d53
forEachLaunchBound -> collectLaunchBounds
jdenny-ornl Sep 26, 2024
0658a21
Merge branch 'main' into gpuprofdriver
EthanLuisMcDonough Sep 27, 2024
f5d9f55
Rebase updates
EthanLuisMcDonough Sep 28, 2024
e246227
Hack offload tests to find built llvm-profdata
jdenny-ornl Sep 27, 2024
3b6ce07
Merge branch 'main' into kernel-info-pr
jdenny-ornl Sep 28, 2024
feeaa37
Remove redundant private
jdenny-ornl Sep 28, 2024
557dd16
Merge branch 'pr-94268-fixup' into kernel-info-pgo
jdenny-ornl Sep 28, 2024
d2847b0
Extend kernel-info to emit PGO-based FLOP count
jdenny-ornl Sep 30, 2024
0672e2c
Merge branch 'main' into kernel-info-pgo
jdenny-ornl Oct 3, 2024
e04b933
Improve some kernel-info instruction remarks
jdenny-ornl Oct 3, 2024
b9b95a2
Merge branch 'main' into kernel-info-pr
jdenny-ornl Oct 11, 2024
116f1c9
Remove todos, as requested
jdenny-ornl Oct 11, 2024
2094465
Combine registerFullLinkTimeOptimizationLastEPCallback calls
jdenny-ornl Oct 11, 2024
39bce7c
collectLaunchBounds -> collectKernelLaunchBounds
jdenny-ornl Oct 11, 2024
14345cf
Spell kernel-info properties like their IR attributes
jdenny-ornl Oct 11, 2024
ad393d2
Replace -kernel-info-end-lto with -no-kernel-info-end-lto
jdenny-ornl Oct 11, 2024
d3beccf
Apply clang-format
jdenny-ornl Oct 11, 2024
5a4b873
Avoid auto, as requested
jdenny-ornl Oct 14, 2024
571181b
For function name, use debug info or keep @
jdenny-ornl Oct 14, 2024
cfda91d
Merge branch 'kernel-info-pr' into kernel-info-pgo
jdenny-ornl Oct 15, 2024
a5ce547
Use anonymous namespace
jdenny-ornl Oct 16, 2024
4d60911
Remove currently unused capabilities, as requested
jdenny-ornl Oct 16, 2024
0c30e7c
Rename test files without LLVM IR to .test
jdenny-ornl Oct 16, 2024
f5a6fbd
Regenerate OpenMP tests from current clang
jdenny-ornl Oct 17, 2024
baad223
Include LLVM value name in alloca report
jdenny-ornl Oct 17, 2024
28a5bcb
Merge branch 'kernel-info-pr' into kernel-info-pgo
jdenny-ornl Oct 18, 2024
1d0a961
Add llvm-profdata substitution to offload tests
EthanLuisMcDonough Oct 25, 2024
0ac2d5f
Merge branch 'main' into gpuprofwrite
EthanLuisMcDonough Oct 25, 2024
c6b34ad
Prepend target prefix to basename
EthanLuisMcDonough Oct 28, 2024
94ed55b
Merge branch 'main' into gpuprofwrite
EthanLuisMcDonough Nov 15, 2024
86f9683
Merge branch 'main' into kernel-info-pr
jdenny-ornl Nov 27, 2024
c9aebce
Update expected amdgpu-max-num-workgroups default values
jdenny-ornl Nov 27, 2024
d460ffa
Merge branch 'kernel-info-pr' into kernel-info-pgo
jdenny-ornl Nov 27, 2024
e690e2a
Update llvm-profdata test fix
jdenny-ornl Nov 27, 2024
e80f7ff
Prepend target prefix to basename
EthanLuisMcDonough Oct 28, 2024
26f5428
Merge branch 'main' into gpuprofwrite
EthanLuisMcDonough Dec 10, 2024
d9e864e
Merge branch 'main' into gpuprofwrite
EthanLuisMcDonough Dec 27, 2024
3f80999
Merge branch 'gpuprofwrite' into gpuprofdriver
EthanLuisMcDonough Dec 27, 2024
8982f8f
Merge branch 'main' into kernel-info-pr
jdenny-ornl Dec 27, 2024
517c018
Merge branch 'main' into gpuprofdriver
EthanLuisMcDonough Dec 27, 2024
151bfb3
Regenerate OpenMP tests from current clang
jdenny-ornl Dec 27, 2024
2438390
Merge branch 'kernel-info-pr' into kernel-info-pgo
jdenny-ornl Dec 27, 2024
6b9cfdd
Merge branch 'pr-94268' into kernel-info-pgo
jdenny-ornl Dec 27, 2024
eb3eb14
Merge branch 'main' into kernel-info-pgo
jdenny-ornl Jan 29, 2025
51d1d91
Merge branch 'main' into gpuprofdriver
EthanLuisMcDonough Feb 11, 2025
182cbaf
Merge branch 'pr-94268' into kernel-info-pgo
jdenny-ornl Feb 19, 2025
8acd057
Merge branch 'main' into kernel-info-pgo
jdenny-ornl Apr 9, 2025
ffcc50d
Update KernelInfo.rst for upstream PGO GPU interface changes
jdenny-ornl Apr 9, 2025
84a0c45
Merge branch 'main' into kernel-info-pgo
jdenny-ornl May 6, 2025
8149708
Extend to intrinsics (e.g., @llvm.fmuladd.*)
jdenny-ornl May 10, 2025
002f393
Use `-Xarch_device -fprofile-update=atomic` in example
jdenny-ornl May 10, 2025
6e7208e
Also report floating point bytes moved from profile
jdenny-ornl May 13, 2025
47d6b9a
Update KernelInfo.rst example
jdenny-ornl May 13, 2025
8daf984
Use getTypeStoreSize not getTypeAllocSize for bytes moved
jdenny-ornl May 13, 2025
e1c50c2
Remove an answered todo
jdenny-ornl May 13, 2025
57a9848
Adjust flops for some instructions based on amdgpu hw counters
jdenny-ornl May 28, 2025
30e7101
KernelInfo.rst: Drop unnecessary -Xarch_device
jdenny-ornl May 29, 2025
c965a95
Merge branch 'main' into kernel-info-pgo
jdenny-ornl Jun 10, 2025
53 changes: 53 additions & 0 deletions llvm/docs/KernelInfo.rst
@@ -61,3 +61,56 @@ behavior so you can position ``kernel-info`` explicitly:
$ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \
-pass-remarks=kernel-info -no-kernel-info-end-lto \
-passes='module(kernel-info),lto<O2>'

PGO
===

Profile data gathered with LLVM's PGO implementation for GPUs can augment the
info reported by kernel-info. In particular, kernel-info can estimate the
number of floating point operations executed and the number of floating point
bytes moved.

For example, the following computes 2\ :sup:`4`\ , so we expect 4 fmul
instructions to execute at run time, and we expect a load and store for ``x``:

.. code-block:: shell

$ cat test.c
#include <stdio.h>
#include <stdlib.h>
__attribute__((noinline))
double test(double x, int n) {
double res = 1;
for (int i = 0; i < n; ++i)
res *= x;
return res;
}
int main(int argc, char *argv[]) {
double x = atof(argv[1]);
unsigned n = atoi(argv[2]);
#pragma omp target map(tofrom:x)
x = test(x, n);
printf("%f\n", x);
return 0;
}

$ clang -O1 -g -fopenmp --offload-arch=native test.c -o test \
-fprofile-generate -fprofile-update=atomic

$ LLVM_PROFILE_FILE=test.profraw ./test 2 4
16.000000

$ llvm-profdata merge -output=test.profdata *.profraw

$ clang -O1 -g -fopenmp --offload-arch=native test.c -foffload-lto \
-Rpass=kernel-info -fprofile-use=test.profdata | \
grep "test.c:.*Floating\|double"
test.c:14:14: in artificial function '__omp_offloading_34_1c64d55_main_l13', double 'load' ('%11') moved 8 fp bytes
test.c:14:7: in artificial function '__omp_offloading_34_1c64d55_main_l13', double 'store' moved 8 fp bytes
test.c:13:0: in artificial function '__omp_offloading_34_1c64d55_main_l13', ProfileFloatingPointOpCount = 0
test.c:13:0: in artificial function '__omp_offloading_34_1c64d55_main_l13', ProfileFloatingPointBytesMoved = 16
test.c:7:11: in function 'test', double 'fmul' ('%9') executed 4 flops
test.c:4:0: in function 'test', ProfileFloatingPointOpCount = 4
test.c:4:0: in function 'test', ProfileFloatingPointBytesMoved = 0

While ``-fprofile-update=atomic`` is not required for the simple example above,
it can be critical while profiling parallel code.
219 changes: 201 additions & 18 deletions llvm/lib/Analysis/KernelInfo.cpp
@@ -31,7 +31,8 @@ namespace {

/// Data structure holding function info for kernels.
class KernelInfo {
void updateForBB(const BasicBlock &BB, OptimizationRemarkEmitter &ORE);
void updateForBB(const BasicBlock &BB, BlockFrequencyInfo &BFI,
OptimizationRemarkEmitter &ORE);

public:
static void emitKernelInfo(Function &F, FunctionAnalysisManager &FAM,
@@ -73,10 +74,120 @@ class KernelInfo {

/// Number of flat address space memory accesses (via load, store, etc.).
int64_t FlatAddrspaceAccesses = 0;

/// Estimate of the number of floating point operations typically executed
/// based on any available profile data. If no profile data is available, the
/// count is zero.
uint64_t ProfileFloatingPointOpCount = 0;

/// Estimate of the number of bytes of floating point memory typically moved
/// (e.g., load or store) based on any available profile data. If no profile
/// data is available, the count is zero. LLVM memory access operations
/// (e.g., llvm.memcpy.*, cmpxchg) that are always encoded as operating on
/// integer types and never on floating point types are not included.
uint64_t ProfileFloatingPointBytesMoved = 0;
};

} // end anonymous namespace

// For the purposes of KernelInfo::ProfileFloatingPointOpCount, should the
// specified Instruction be considered a floating point operation? If so,
// return the floating point type and a multiplier for its FLOP count.
// Otherwise, return std::nullopt.
//
// TODO: Does this correctly identify floating point operations we care about?
// For example, we skip phi even when it returns a floating point value, and
// load is covered by KernelInfo::ProfileFloatingPointBytesMoved instead. Is
// there anything missing that should be covered here? Is there anything else
// that we should exclude? For example, at least for AMD GPU, there are
// floating point instruction patterns (e.g., fmul with one operand in some
// category of immediate) that lower to instructions that do not trigger AMD's
// floating point hardware counters. Should we somehow query target-specific
// lowering to exclude such cases?
static std::optional<std::pair<Type *, unsigned>>
getFloatingPointOp(const Instruction &I) {
if (const AtomicRMWInst *At = dyn_cast<AtomicRMWInst>(&I)) {
if (At->isFloatingPointOperation())
return std::make_pair(At->getType(), 1);
return std::nullopt;
}
if (const CastInst *CI = dyn_cast<CastInst>(&I)) {
Type *SrcTy = CI->getSrcTy();
Type *DestTy = CI->getDestTy();
// For AMD GPU, conversions between fp and integer types where either is not
// 64-bit lower to instructions that do not trigger AMD's floating point
// hardware counters. TODO: Is that true for all archs, all non-64-bit
// floating point types, and all non-64-bit integer types? On AMD GPU, we
// have checked 64 vs. 32 and 32 vs. 32 so far.
if (SrcTy->getScalarSizeInBits() != 64 ||
DestTy->getScalarSizeInBits() != 64)
return std::nullopt;
// For AMD GPU, uitofp and sitofp lower to FADD instructions. TODO: Is that
// true for all archs?
if (isa<UIToFPInst>(I) || isa<SIToFPInst>(I))
return std::make_pair(DestTy, 1);
// For AMD GPU, fptoui and fptosi lower to FMA instructions. Thus, as for
// FMA instructions below, we multiply by 2. TODO: Is that true for all
// archs?
if (isa<FPToUIInst>(I) || isa<FPToSIInst>(I))
return std::make_pair(SrcTy, 2);
return std::nullopt;
}
Type *Ty = I.getType();
if (!Ty->isFPOrFPVectorTy())
return std::nullopt;
if (I.isBinaryOp() || I.isUnaryOp()) {
switch (I.getOpcode()) {
// For AMD GPU, fneg lowers to instructions that do not trigger AMD's
// floating point hardware counters. TODO: Is that true for all archs and
// all floating point types? On AMD GPU, we have checked 64 bit.
case Instruction::FNeg:
return std::nullopt;
// This multiplier is based on AMD hardware fp counters for fdiv:
// - SQ_INSTS_VALU_FMA_F64 = 6*2
// - SQ_INSTS_VALU_MUL_F64 = 1
// - SQ_INSTS_VALU_TRANS_F64 = 1
// TODO: Is that true for all archs and all floating point types? On AMD
// GPU, we have checked 64 bit. Moreover, this is surely brittle. What if
// the implementation changes?
case Instruction::FDiv:
return std::make_pair(Ty, 14);
}
return std::make_pair(Ty, 1);
}
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
switch (II->getIntrinsicID()) {
// For AMD GPU, these lower to instructions that do not trigger AMD's
// floating point hardware counters. TODO: Is that true for all archs and
// all floating point types? On AMD GPU, we have checked 64 bit.
case Intrinsic::copysign:
case Intrinsic::fabs:
case Intrinsic::floor:
case Intrinsic::ldexp:
case Intrinsic::minnum:
case Intrinsic::rint:
return std::nullopt;
// For FMA instructions, we mimic AMD's rocprofiler-compute, which
// multiplies SQ_INSTS_VALU_FMA_* counts by 2.
case Intrinsic::fmuladd:
case Intrinsic::fma:
return std::make_pair(Ty, 2);
// This multiplier is based on AMD hardware fp counters for this intrinsic:
// - SQ_INSTS_VALU_FMA_F64 = 7*2
// - SQ_INSTS_VALU_MUL_F64 = 2
// - SQ_INSTS_VALU_TRANS_F64 = 1
// TODO: Is that true for all archs and all floating point types? On AMD
// GPU, we have checked 64 bit. Moreover, this is surely brittle. What if
// the implementation changes?
case Intrinsic::sqrt:
return std::make_pair(Ty, 17);
default:
return std::make_pair(Ty, 1);
}
}
return std::nullopt;
}

static void identifyCallee(OptimizationRemark &R, const Module *M,
const Value *V, StringRef Kind = "") {
SmallString<100> Name; // might be function name or asm expression
@@ -100,6 +211,19 @@ static void identifyFunction(OptimizationRemark &R, const Function &F) {
identifyCallee(R, F.getParent(), &F, "function");
}

static void identifyInstruction(OptimizationRemark &R, const Instruction &I) {
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I))
R << "'" << II->getCalledFunction()->getName() << "' call";
else
R << "'" << I.getOpcodeName() << "'";
if (!I.getType()->isVoidTy()) {
SmallString<20> Name;
raw_svector_ostream OS(Name);
I.printAsOperand(OS, /*PrintType=*/false, I.getModule());
R << " ('" << Name << "')";
}
}

static void remarkAlloca(OptimizationRemarkEmitter &ORE, const Function &Caller,
const AllocaInst &Alloca,
TypeSize::ScalarTy StaticSize) {
@@ -153,33 +277,69 @@ static void remarkCall(OptimizationRemarkEmitter &ORE, const Function &Caller,

static void remarkFlatAddrspaceAccess(OptimizationRemarkEmitter &ORE,
const Function &Caller,
const Instruction &Inst) {
const Instruction &I) {
ORE.emit([&] {
OptimizationRemark R(DEBUG_TYPE, "FlatAddrspaceAccess", &I);
R << "in ";
identifyFunction(R, Caller);
R << ", ";
identifyInstruction(R, I);
R << " accesses memory in flat address space";
return R;
});
}

static void
remarkFloatingPointOp(OptimizationRemarkEmitter &ORE, const Function &Caller,
const Instruction &I, Type *Ty, unsigned Multiplier,
std::optional<uint64_t> BlockProfileCount,
std::optional<uint64_t> BytesMoved = std::nullopt) {
ORE.emit([&] {
OptimizationRemark R(DEBUG_TYPE, "FlatAddrspaceAccess", &Inst);
OptimizationRemark R(DEBUG_TYPE,
BytesMoved ? "ProfileFloatingPointBytesMoved"
: "ProfileFloatingPointOpCount",
&I);
R << "in ";
identifyFunction(R, Caller);
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Inst)) {
R << ", '" << II->getCalledFunction()->getName() << "' call";
R << ", ";
SmallString<10> TyName;
raw_svector_ostream OS(TyName);
Ty->print(OS);
R << TyName << " ";
identifyInstruction(R, I);
if (BlockProfileCount) {
if (BytesMoved)
R << " moved " << itostr(*BytesMoved * *BlockProfileCount)
<< " fp bytes";
else
R << " executed " << utostr(*BlockProfileCount) << " flops";
if (Multiplier != 1)
R << " x " << utostr(Multiplier);
} else {
R << ", '" << Inst.getOpcodeName() << "' instruction";
R << " has no profile data";
}
if (!Inst.getType()->isVoidTy()) {
SmallString<20> Name;
raw_svector_ostream OS(Name);
Inst.printAsOperand(OS, /*PrintType=*/false, Caller.getParent());
R << " ('" << Name << "')";
}
R << " accesses memory in flat address space";
return R;
});
}

void KernelInfo::updateForBB(const BasicBlock &BB,
void KernelInfo::updateForBB(const BasicBlock &BB, BlockFrequencyInfo &BFI,
OptimizationRemarkEmitter &ORE) {
const Function &F = *BB.getParent();
const Module &M = *F.getParent();
const DataLayout &DL = M.getDataLayout();
// TODO: Is AllowSynthetic what we want?
std::optional<uint64_t> BlockProfileCount =
BFI.getBlockProfileCount(&BB, /*AllowSynthetic=*/true);
for (const Instruction &I : BB.instructionsWithoutDebug()) {
auto HandleFloatingPointBytesMoved = [&]() {
Type *Ty = I.getAccessType();
if (!Ty || !Ty->isFPOrFPVectorTy())
return;
TypeSize::ScalarTy Size = DL.getTypeStoreSize(Ty).getFixedValue();
ProfileFloatingPointBytesMoved += BlockProfileCount.value_or(0) * Size;
remarkFloatingPointOp(ORE, F, I, Ty, /*Multiplier=*/1, BlockProfileCount,
Size);
};
if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(&I)) {
++Allocas;
TypeSize::ScalarTy StaticSize = 0;
@@ -237,38 +397,58 @@
remarkFlatAddrspaceAccess(ORE, F, I);
}
}
// llvm.memcpy.*, llvm.memset.*, etc. are encoded as operating on
// integer types not floating point types, so
// HandleFloatingPointBytesMoved is useless here.
}
} else if (const LoadInst *Load = dyn_cast<LoadInst>(&I)) {
if (Load->getPointerAddressSpace() == FlatAddrspace) {
++FlatAddrspaceAccesses;
remarkFlatAddrspaceAccess(ORE, F, I);
}
HandleFloatingPointBytesMoved();
} else if (const StoreInst *Store = dyn_cast<StoreInst>(&I)) {
if (Store->getPointerAddressSpace() == FlatAddrspace) {
++FlatAddrspaceAccesses;
remarkFlatAddrspaceAccess(ORE, F, I);
}
HandleFloatingPointBytesMoved();
} else if (const AtomicRMWInst *At = dyn_cast<AtomicRMWInst>(&I)) {
if (At->getPointerAddressSpace() == FlatAddrspace) {
++FlatAddrspaceAccesses;
remarkFlatAddrspaceAccess(ORE, F, I);
}
HandleFloatingPointBytesMoved();
} else if (const AtomicCmpXchgInst *At = dyn_cast<AtomicCmpXchgInst>(&I)) {
if (At->getPointerAddressSpace() == FlatAddrspace) {
++FlatAddrspaceAccesses;
remarkFlatAddrspaceAccess(ORE, F, I);
}
// cmpxchg is encoded as operating on integer types not floating point
// types, so HandleFloatingPointBytesMoved is useless here.
}
if (auto Op = getFloatingPointOp(I)) {
Type *Ty;
unsigned Multiplier;
std::tie(Ty, Multiplier) = *Op;
ProfileFloatingPointOpCount += Multiplier * BlockProfileCount.value_or(0);
remarkFloatingPointOp(ORE, F, I, Ty, Multiplier, BlockProfileCount);
}
}
}

static void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F,
StringRef Name, int64_t Value) {
static std::string toString(bool Val) { return itostr(Val); }
static std::string toString(int64_t Val) { return itostr(Val); }
static std::string toString(uint64_t Val) { return utostr(Val); }

template <typename T>
void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F,
StringRef Name, T Val) {
ORE.emit([&] {
OptimizationRemark R(DEBUG_TYPE, Name, &F);
R << "in ";
identifyFunction(R, F);
R << ", " << Name << " = " << itostr(Value);
R << ", " << Name << " = " << toString(Val);
return R;
});
}
@@ -284,6 +464,7 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM,
TargetMachine *TM) {
KernelInfo KI;
TargetTransformInfo &TheTTI = FAM.getResult<TargetIRAnalysis>(F);
BlockFrequencyInfo &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
KI.FlatAddrspace = TheTTI.getFlatAddressSpace();

// Record function properties.
@@ -296,7 +477,7 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM,

auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
for (const auto &BB : F)
KI.updateForBB(BB, ORE);
KI.updateForBB(BB, BFI, ORE);

#define REMARK_PROPERTY(PROP_NAME) \
remarkProperty(ORE, F, #PROP_NAME, KI.PROP_NAME)
@@ -312,6 +493,8 @@
REMARK_PROPERTY(InlineAssemblyCalls);
REMARK_PROPERTY(Invokes);
REMARK_PROPERTY(FlatAddrspaceAccesses);
REMARK_PROPERTY(ProfileFloatingPointOpCount);
REMARK_PROPERTY(ProfileFloatingPointBytesMoved);
#undef REMARK_PROPERTY
}
