diff --git a/llvm/docs/KernelInfo.rst b/llvm/docs/KernelInfo.rst
index dac642f1ffc65..071f28898c73d 100644
--- a/llvm/docs/KernelInfo.rst
+++ b/llvm/docs/KernelInfo.rst
@@ -61,3 +61,56 @@ behavior so you can position ``kernel-info`` explicitly:
   $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \
       -pass-remarks=kernel-info -no-kernel-info-end-lto \
       -passes='module(kernel-info),lto'
+
+PGO
+===
+
+Profile data collected with LLVM's PGO implementation for GPUs can augment
+the info reported by kernel-info. In particular, kernel-info can estimate the
+number of floating point operations executed or bytes moved.
+
+For example, the following computes 2\ :sup:`4`\ , so we expect 4 fmul
+instructions to execute at run time, and we expect a load and a store for
+``x``, each moving 8 bytes, for a total of 16 fp bytes:
+
+.. code-block:: shell
+
+  $ cat test.c
+  #include <stdio.h>
+  #include <stdlib.h>
+  __attribute__((noinline))
+  double test(double x, int n) {
+    double res = 1;
+    for (int i = 0; i < n; ++i)
+      res *= x;
+    return res;
+  }
+  int main(int argc, char *argv[]) {
+    double x = atof(argv[1]);
+    unsigned n = atoi(argv[2]);
+    #pragma omp target map(tofrom:x)
+    x = test(x, n);
+    printf("%f\n", x);
+    return 0;
+  }
+
+  $ clang -O1 -g -fopenmp --offload-arch=native test.c -o test \
+      -fprofile-generate -fprofile-update=atomic
+
+  $ LLVM_PROFILE_FILE=test.profraw ./test 2 4
+  16.000000
+
+  $ llvm-profdata merge -output=test.profdata *.profraw
+
+  $ clang -O1 -g -fopenmp --offload-arch=native test.c -foffload-lto \
+      -Rpass=kernel-info -fprofile-use=test.profdata | \
+    grep "test.c:.*Floating\|double"
+  test.c:14:14: in artificial function '__omp_offloading_34_1c64d55_main_l13', double 'load' ('%11') moved 8 fp bytes
+  test.c:14:7: in artificial function '__omp_offloading_34_1c64d55_main_l13', double 'store' moved 8 fp bytes
+  test.c:13:0: in artificial function '__omp_offloading_34_1c64d55_main_l13', ProfileFloatingPointOpCount = 0
+  test.c:13:0: in artificial function '__omp_offloading_34_1c64d55_main_l13', ProfileFloatingPointBytesMoved = 16
+  test.c:7:11: in function 'test', double 'fmul' ('%9') executed 4 flops
+  test.c:4:0: in function 'test', ProfileFloatingPointOpCount = 4
+  test.c:4:0: in function 'test', ProfileFloatingPointBytesMoved = 0
+
+While ``-fprofile-update=atomic`` is not required for the simple example above,
+it can be critical when profiling parallel code.
diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp
index 93dd7cecb32e1..abe9c2ce25c17 100644
--- a/llvm/lib/Analysis/KernelInfo.cpp
+++ b/llvm/lib/Analysis/KernelInfo.cpp
@@ -31,7 +31,8 @@ namespace {
 
 /// Data structure holding function info for kernels.
 class KernelInfo {
-  void updateForBB(const BasicBlock &BB, OptimizationRemarkEmitter &ORE);
+  void updateForBB(const BasicBlock &BB, BlockFrequencyInfo &BFI,
+                   OptimizationRemarkEmitter &ORE);
 
 public:
   static void emitKernelInfo(Function &F, FunctionAnalysisManager &FAM,
@@ -73,10 +74,120 @@ class KernelInfo {
 
   /// Number of flat address space memory accesses (via load, store, etc.).
   int64_t FlatAddrspaceAccesses = 0;
+
+  /// Estimate of the number of floating point operations typically executed
+  /// based on any available profile data. If no profile data is available,
+  /// the count is zero.
+  uint64_t ProfileFloatingPointOpCount = 0;
+
+  /// Estimate of the number of bytes of floating point memory typically moved
+  /// (e.g., by load or store) based on any available profile data. If no
+  /// profile data is available, the count is zero.
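+  /// For example, a 'load' or 'store' of a double in a basic block whose
+  /// profile count is N contributes N * 8 bytes to this estimate.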
+  /// LLVM memory access operations (e.g., llvm.memcpy.*, cmpxchg) that are
+  /// always encoded as operating on integer types and never on floating
+  /// point types are not included.
+  uint64_t ProfileFloatingPointBytesMoved = 0;
 };
 
 } // end anonymous namespace
 
+// For the purposes of KernelInfo::ProfileFloatingPointOpCount, should the
+// specified Instruction be considered a floating point operation? If so,
+// return the floating point type and a multiplier for its FLOP count.
+// Otherwise, return std::nullopt.
+//
+// TODO: Does this correctly identify floating point operations we care about?
+// For example, we skip phi even when it returns a floating point value, and
+// load is covered by KernelInfo::ProfileFloatingPointBytesMoved instead. Is
+// there anything missing that should be covered here? Is there anything else
+// that we should exclude? For example, at least for AMD GPU, there are
+// floating point instruction patterns (e.g., fmul with one operand in some
+// category of immediate) that lower to instructions that do not trigger AMD's
+// floating point hardware counters. Should we somehow query target-specific
+// lowering to exclude such cases?
+static std::optional<std::pair<Type *, unsigned>>
+getFloatingPointOp(const Instruction &I) {
+  if (const AtomicRMWInst *At = dyn_cast<AtomicRMWInst>(&I)) {
+    if (At->isFloatingPointOperation())
+      return std::make_pair(At->getType(), 1);
+    return std::nullopt;
+  }
+  if (const CastInst *CI = dyn_cast<CastInst>(&I)) {
+    Type *SrcTy = CI->getSrcTy();
+    Type *DestTy = CI->getDestTy();
+    // For AMD GPU, conversions between fp and integer types where either type
+    // is not 64-bit lower to instructions that do not trigger AMD's floating
+    // point hardware counters. TODO: Is that true for all archs, all
+    // non-64-bit floating point types, and all non-64-bit integer types? On
+    // AMD GPU, we have checked 64 vs. 32 and 32 vs. 32 so far.
+    if (SrcTy->getScalarSizeInBits() != 64 ||
+        DestTy->getScalarSizeInBits() != 64)
+      return std::nullopt;
+    // For AMD GPU, uitofp and sitofp lower to FADD instructions. TODO: Is that
+    // true for all archs?
+    if (isa<UIToFPInst>(I) || isa<SIToFPInst>(I))
+      return std::make_pair(DestTy, 1);
+    // For AMD GPU, fptoui and fptosi lower to FMA instructions. Thus, as for
+    // FMA instructions below, we multiply by 2. TODO: Is that true for all
+    // archs?
+    if (isa<FPToUIInst>(I) || isa<FPToSIInst>(I))
+      return std::make_pair(SrcTy, 2);
+    return std::nullopt;
+  }
+  Type *Ty = I.getType();
+  if (!Ty->isFPOrFPVectorTy())
+    return std::nullopt;
+  if (I.isBinaryOp() || I.isUnaryOp()) {
+    switch (I.getOpcode()) {
+    // For AMD GPU, fneg lowers to instructions that do not trigger AMD's
+    // floating point hardware counters. TODO: Is that true for all archs and
+    // all floating point types? On AMD GPU, we have checked 64 bit.
+    case Instruction::FNeg:
+      return std::nullopt;
+    // This multiplier is based on AMD hardware fp counters for fdiv:
+    // - SQ_INSTS_VALU_FMA_F64 = 6*2
+    // - SQ_INSTS_VALU_MUL_F64 = 1
+    // - SQ_INSTS_VALU_TRANS_F64 = 1
+    // TODO: Is that true for all archs and all floating point types? On AMD
+    // GPU, we have checked 64 bit. Moreover, this is surely brittle. What if
+    // the implementation changes?
+    case Instruction::FDiv:
+      return std::make_pair(Ty, 14);
+    }
+    return std::make_pair(Ty, 1);
+  }
+  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
+    switch (II->getIntrinsicID()) {
+    // For AMD GPU, these lower to instructions that do not trigger AMD's
+    // floating point hardware counters. TODO: Is that true for all archs and
+    // all floating point types? On AMD GPU, we have checked 64 bit.
+    case Intrinsic::copysign:
+    case Intrinsic::fabs:
+    case Intrinsic::floor:
+    case Intrinsic::ldexp:
+    case Intrinsic::minnum:
+    case Intrinsic::rint:
+      return std::nullopt;
+    // For FMA instructions, we mimic AMD's rocprofiler-compute, which
+    // multiplies SQ_INSTS_VALU_FMA_* counts by 2.
+    case Intrinsic::fmuladd:
+    case Intrinsic::fma:
+      return std::make_pair(Ty, 2);
+    // This multiplier is based on AMD hardware fp counters for this intrinsic:
+    // - SQ_INSTS_VALU_FMA_F64 = 7*2
+    // - SQ_INSTS_VALU_MUL_F64 = 2
+    // - SQ_INSTS_VALU_TRANS_F64 = 1
+    // TODO: Is that true for all archs and all floating point types? On AMD
+    // GPU, we have checked 64 bit. Moreover, this is surely brittle. What if
+    // the implementation changes?
+    case Intrinsic::sqrt:
+      return std::make_pair(Ty, 17);
+    default:
+      return std::make_pair(Ty, 1);
+    }
+  }
+  return std::nullopt;
+}
+
 static void identifyCallee(OptimizationRemark &R, const Module *M,
                            const Value *V, StringRef Kind = "") {
   SmallString<100> Name; // might be function name or asm expression
@@ -100,6 +211,19 @@ static void identifyFunction(OptimizationRemark &R, const Function &F) {
   identifyCallee(R, F.getParent(), &F, "function");
 }
 
+static void identifyInstruction(OptimizationRemark &R, const Instruction &I) {
+  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I))
+    R << "'" << II->getCalledFunction()->getName() << "' call";
+  else
+    R << "'" << I.getOpcodeName() << "'";
+  if (!I.getType()->isVoidTy()) {
+    SmallString<20> Name;
+    raw_svector_ostream OS(Name);
+    I.printAsOperand(OS, /*PrintType=*/false, I.getModule());
+    R << " ('" << Name << "')";
+  }
+}
+
 static void remarkAlloca(OptimizationRemarkEmitter &ORE,
                          const Function &Caller, const AllocaInst &Alloca,
                          TypeSize::ScalarTy StaticSize) {
@@ -153,33 +277,69 @@ static void remarkCall(OptimizationRemarkEmitter &ORE, const Function &Caller,
 
 static void remarkFlatAddrspaceAccess(OptimizationRemarkEmitter &ORE,
                                       const Function &Caller,
-                                      const Instruction &Inst) {
+                                      const Instruction &I) {
+  ORE.emit([&] {
+    OptimizationRemark R(DEBUG_TYPE, "FlatAddrspaceAccess", &I);
+    R << "in ";
+    identifyFunction(R, Caller);
+    R << ", ";
+    identifyInstruction(R, I);
+    R << " accesses memory in flat address space";
+    return R;
+  });
+}
+
+static void
+remarkFloatingPointOp(OptimizationRemarkEmitter &ORE, const Function &Caller,
+                      const Instruction &I, Type *Ty, unsigned Multiplier,
+                      std::optional<uint64_t> BlockProfileCount,
+                      std::optional<uint64_t> BytesMoved = std::nullopt) {
   ORE.emit([&] {
-    OptimizationRemark R(DEBUG_TYPE, "FlatAddrspaceAccess", &Inst);
+    OptimizationRemark R(DEBUG_TYPE,
+                         BytesMoved ? "ProfileFloatingPointBytesMoved"
"ProfileFloatingPointBytesMoved" + : "ProfileFloatingPointOpCount", + &I); R << "in "; identifyFunction(R, Caller); - if (const IntrinsicInst *II = dyn_cast(&Inst)) { - R << ", '" << II->getCalledFunction()->getName() << "' call"; + R << ", "; + SmallString<10> TyName; + raw_svector_ostream OS(TyName); + Ty->print(OS); + R << TyName << " "; + identifyInstruction(R, I); + if (BlockProfileCount) { + if (BytesMoved) + R << " moved " << itostr(*BytesMoved * *BlockProfileCount) + << " fp bytes"; + else + R << " executed " << utostr(*BlockProfileCount) << " flops"; + if (Multiplier != 1) + R << " x " << utostr(Multiplier); } else { - R << ", '" << Inst.getOpcodeName() << "' instruction"; + R << " has no profile data"; } - if (!Inst.getType()->isVoidTy()) { - SmallString<20> Name; - raw_svector_ostream OS(Name); - Inst.printAsOperand(OS, /*PrintType=*/false, Caller.getParent()); - R << " ('" << Name << "')"; - } - R << " accesses memory in flat address space"; return R; }); } -void KernelInfo::updateForBB(const BasicBlock &BB, +void KernelInfo::updateForBB(const BasicBlock &BB, BlockFrequencyInfo &BFI, OptimizationRemarkEmitter &ORE) { const Function &F = *BB.getParent(); const Module &M = *F.getParent(); const DataLayout &DL = M.getDataLayout(); + // TODO: Is AllowSynthetic what we want? + std::optional BlockProfileCount = + BFI.getBlockProfileCount(&BB, /*AllowSynthetic=*/true); for (const Instruction &I : BB.instructionsWithoutDebug()) { + auto HandleFloatingPointBytesMoved = [&]() { + Type *Ty = I.getAccessType(); + if (!Ty || !Ty->isFPOrFPVectorTy()) + return; + TypeSize::ScalarTy Size = DL.getTypeStoreSize(Ty).getFixedValue(); + ProfileFloatingPointBytesMoved += BlockProfileCount.value_or(0) * Size; + remarkFloatingPointOp(ORE, F, I, Ty, /*Multiplier=*/1, BlockProfileCount, + Size); + }; if (const AllocaInst *Alloca = dyn_cast(&I)) { ++Allocas; TypeSize::ScalarTy StaticSize = 0; @@ -237,38 +397,58 @@ void KernelInfo::updateForBB(const BasicBlock &BB, remarkFlatAddrspaceAccess(ORE, F, I); } } + // llvm.memcpy.*, llvm.memset.*, etc. are encoded as operating on + // integer types not floating point types, so + // HandleFloatingPointBytesMoved is useless here. } } else if (const LoadInst *Load = dyn_cast(&I)) { if (Load->getPointerAddressSpace() == FlatAddrspace) { ++FlatAddrspaceAccesses; remarkFlatAddrspaceAccess(ORE, F, I); } + HandleFloatingPointBytesMoved(); } else if (const StoreInst *Store = dyn_cast(&I)) { if (Store->getPointerAddressSpace() == FlatAddrspace) { ++FlatAddrspaceAccesses; remarkFlatAddrspaceAccess(ORE, F, I); } + HandleFloatingPointBytesMoved(); } else if (const AtomicRMWInst *At = dyn_cast(&I)) { if (At->getPointerAddressSpace() == FlatAddrspace) { ++FlatAddrspaceAccesses; remarkFlatAddrspaceAccess(ORE, F, I); } + HandleFloatingPointBytesMoved(); } else if (const AtomicCmpXchgInst *At = dyn_cast(&I)) { if (At->getPointerAddressSpace() == FlatAddrspace) { ++FlatAddrspaceAccesses; remarkFlatAddrspaceAccess(ORE, F, I); } + // cmpxchg is encoded as operating on integer types not floating point + // types, so HandleFloatingPointBytesMoved is useless here. 
+    }
+    if (auto Op = getFloatingPointOp(I)) {
+      Type *Ty;
+      unsigned Multiplier;
+      std::tie(Ty, Multiplier) = *Op;
+      ProfileFloatingPointOpCount += Multiplier * BlockProfileCount.value_or(0);
+      remarkFloatingPointOp(ORE, F, I, Ty, Multiplier, BlockProfileCount);
    }
  }
}
 
-static void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F,
-                           StringRef Name, int64_t Value) {
+static std::string toString(bool Val) { return itostr(Val); }
+static std::string toString(int64_t Val) { return itostr(Val); }
+static std::string toString(uint64_t Val) { return utostr(Val); }
+
+template <typename T>
+void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F,
+                    StringRef Name, T Val) {
   ORE.emit([&] {
     OptimizationRemark R(DEBUG_TYPE, Name, &F);
     R << "in ";
     identifyFunction(R, F);
-    R << ", " << Name << " = " << itostr(Value);
+    R << ", " << Name << " = " << toString(Val);
     return R;
   });
 }
@@ -284,6 +464,7 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM,
                                 TargetMachine *TM) {
   KernelInfo KI;
   TargetTransformInfo &TheTTI = FAM.getResult<TargetIRAnalysis>(F);
+  BlockFrequencyInfo &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
   KI.FlatAddrspace = TheTTI.getFlatAddressSpace();
 
   // Record function properties.
@@ -296,7 +477,7 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM,
   auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
 
   for (const auto &BB : F)
-    KI.updateForBB(BB, ORE);
+    KI.updateForBB(BB, BFI, ORE);
 
 #define REMARK_PROPERTY(PROP_NAME)                                             \
   remarkProperty(ORE, F, #PROP_NAME, KI.PROP_NAME)
@@ -312,6 +493,8 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM,
   REMARK_PROPERTY(InlineAssemblyCalls);
   REMARK_PROPERTY(Invokes);
   REMARK_PROPERTY(FlatAddrspaceAccesses);
+  REMARK_PROPERTY(ProfileFloatingPointOpCount);
+  REMARK_PROPERTY(ProfileFloatingPointBytesMoved);
 #undef REMARK_PROPERTY
 }
 
diff --git a/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll b/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll
index b54c3a18f3e70..a5d7fd783ec48 100644
--- a/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll
+++ b/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll
@@ -1,35 +1,35 @@
 define void @f() !dbg !3 {
 entry:
   ; load: check remarks for both unnamed and named values.
- ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%0') accesses memory in flat address space + ; CHECK: remark: test.c:3:11: in function 'f', 'load' ('%0') accesses memory in flat address space %0 = load i32, ptr null, align 4, !dbg !6 - ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load') accesses memory in flat address space + ; CHECK: remark: test.c:3:11: in function 'f', 'load' ('%load') accesses memory in flat address space %load = load i32, ptr null, align 4, !dbg !6 - ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load0') accesses memory in flat address space + ; CHECK: remark: test.c:3:11: in function 'f', 'load' ('%load0') accesses memory in flat address space %load0 = load i32, ptr addrspace(0) null, align 4, !dbg !6 %load1 = load i32, ptr addrspace(1) null, align 4, !dbg !6 %load2 = load i32, ptr addrspace(2) null, align 4, !dbg !6 ; store - ; CHECK: remark: test.c:4:6: in function 'f', 'store' instruction accesses memory in flat address space + ; CHECK: remark: test.c:4:6: in function 'f', 'store' accesses memory in flat address space store i32 0, ptr null, align 4, !dbg !7 - ; CHECK: remark: test.c:4:6: in function 'f', 'store' instruction accesses memory in flat address space + ; CHECK: remark: test.c:4:6: in function 'f', 'store' accesses memory in flat address space store i32 0, ptr addrspace(0) null, align 4, !dbg !7 store i32 0, ptr addrspace(1) null, align 4, !dbg !7 store i32 0, ptr addrspace(8) null, align 4, !dbg !7 ; atomicrmw - ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction ('%[[#]]') accesses memory in flat address space + ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' ('%[[#]]') accesses memory in flat address space atomicrmw xchg ptr null, i32 10 seq_cst, !dbg !8 - ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction ('%[[#]]') accesses memory in flat address space + ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' ('%[[#]]') accesses memory in flat address space atomicrmw add ptr addrspace(0) null, i32 10 seq_cst, !dbg !8 atomicrmw xchg ptr addrspace(1) null, i32 10 seq_cst, !dbg !8 atomicrmw add ptr addrspace(37) null, i32 10 seq_cst, !dbg !8 ; cmpxchg - ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction ('%[[#]]') accesses memory in flat address space + ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' ('%[[#]]') accesses memory in flat address space cmpxchg ptr null, i32 0, i32 1 acq_rel monotonic, !dbg !9 - ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction ('%[[#]]') accesses memory in flat address space + ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' ('%[[#]]') accesses memory in flat address space cmpxchg ptr addrspace(0) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 cmpxchg ptr addrspace(1) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 cmpxchg ptr addrspace(934) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 diff --git a/llvm/test/Analysis/KernelInfo/flop-pgo.ll b/llvm/test/Analysis/KernelInfo/flop-pgo.ll new file mode 100644 index 0000000000000..99c6920338a95 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/flop-pgo.ll @@ -0,0 +1,247 @@ +; Check info on floating point operations. 
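+;
+; A sketch of how the expected numbers below are derived (informal, based on
+; this test's own profile metadata): each per-instruction remark reports
+; (block profile count) x (per-op multiplier) flops, or
+; (block profile count) x (store size of the accessed fp type) fp bytes moved.
+; For @f, function_entry_count is 2, and the 1:2 branch_weights on the
+; backedge give the loop block an expected count of 3 per entry, i.e., a
+; profile count of 6; hence, e.g., a double load there reports 48 fp bytes.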
+ +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck %s -match-full-lines \ +; RUN: -implicit-check-not='executed {{[0-9]+}} flops' \ +; RUN: -implicit-check-not='moved {{[0-9]+}} fp bytes' + +target datalayout = "e-i65:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; Check function with neither profile data nor floating point operations. +define double @noFlopOrProf() !dbg !100 { + ret double 0.000000e+00, !dbg !105 +} +; CHECK: remark: test.c:1:0: in function 'noFlopOrProf', ProfileFloatingPointOpCount = 0 +; CHECK: remark: test.c:1:0: in function 'noFlopOrProf', ProfileFloatingPointBytesMoved = 0 + +; Check function with profile data but no floating point operations. +define double @noFlop() !dbg !200 !prof !202 { + ret double 0.000000e+00, !dbg !205 +} +; CHECK: remark: test.c:2:0: in function 'noFlop', ProfileFloatingPointOpCount = 0 +; CHECK: remark: test.c:2:0: in function 'noFlop', ProfileFloatingPointBytesMoved = 0 + +; Check function with floating point operations (one that moves data and one +; that does not) but no profile data. +define double @noProf() !dbg !300 { + ; CHECK: remark: test.c:4:9: in function 'noProf', double 'fadd' ('%fadd') has no profile data + %fadd = fadd double 0.000000e+00, 0.000000e+00, !dbg !304 + ; CHECK: remark: test.c:5:9: in function 'noProf', double 'load' ('%load') has no profile data + %load = load double, ptr null, align 4, !dbg !305 + ret double 0.000000e+00, !dbg !306 +} +; CHECK: remark: test.c:3:0: in function 'noProf', ProfileFloatingPointOpCount = 0 +; CHECK: remark: test.c:3:0: in function 'noProf', ProfileFloatingPointBytesMoved = 0 + +; Check function with floating point operations and profile data. +define double @f() !dbg !400 !prof !402 { + ; Check floating point operation in entry block, which has a count of 1 per + ; entry into the function. + ; + ; Also, check case of basic block with exactly 1 floating point operation. + %alloca = alloca double, align 8, addrspace(1), !dbg !405 + ; CHECK: remark: test.c:10:9: in function 'f', double 'fadd' ('%fadd') executed 2 flops + %fadd = fadd double 0.000000e+00, 0.000000e+00, !dbg !410 + br label %.none, !dbg !405 + + ; Check floating point operation in ret block. + ; + ; branch_weights gives this block a count of 1 per entry into the function. +.ret: ; preds = %.many + ; CHECK: remark: test.c:20:9: in function 'f', double 'fsub' ('%fsub') executed 2 flops + %fsub = fsub double 0.000000e+00, 0.000000e+00, !dbg !420 + ; CHECK: remark: test.c:21:9: in function 'f', double 'fmul' ('%fmul') executed 2 flops + %fmul = fmul double 0.000000e+00, 0.000000e+00, !dbg !421 + ret double 0.000000e+00, !dbg !405 + + ; Check case of 0 floating point operations in a basic block. +.none: ; preds = %0 + br label %.many, !dbg !405 + + ; Check case of many floating point operations in a basic block. + ; + ; branch_weights gives this block a count of 3 per entry into the function. +.many: ; preds = %.none, %.many + ; These are not counted as floating point ops even though they return floating + ; point values. For AMD GPUs, we have seen no evidence that the hardware + ; instructions to which they lower ever trigger floating point hardware + ; counters. More appear with conversions below. 
+ %phi = phi double [ %fadd, %.none ], [ %load, %.many ], !dbg !405 + %fneg = fneg double 0.000000e+00, !dbg !405 + %copysign = call double @llvm.copysign.f64(double 0.000000e+00, double 0.000000e+00), !dbg !405 + %fabs = call double @llvm.fabs.f64(double 0.000000e+00), !dbg !405 + %floor = call double @llvm.floor.f64(double 0.000000e+00), !dbg !405 + %ldexp = call double @llvm.ldexp.f64.i32(double 0.000000e+00, i32 0), !dbg !405 + %minnum = call double @llvm.minnum.f64(double 0.000000e+00, double 0.000000e+00), !dbg !405 + %rint = call double @llvm.rint.f64(double 0.000000e+00), !dbg !405 + + ; Check simple floating point ops not already checked above, and check an + ; unnamed value. + ; + ; CHECK: remark: test.c:30:9: in function 'f', double 'fdiv' ('%1') executed 6 flops x 14 + %1 = fdiv double 0.000000e+00, 0.000000e+00, !dbg !430 + ; CHECK: remark: test.c:31:9: in function 'f', double 'load' ('%load') moved 48 fp bytes + %load = load double, ptr addrspace(1) %alloca, align 8, !dbg !431 + ; CHECK: remark: test.c:32:9: in function 'f', double 'store' moved 48 fp bytes + store double 0.000000e+00, ptr addrspace(1) %alloca, align 8, !dbg !432 + + ; Check atomicrmw. + ; + ; CHECK: remark: test.c:40:9: in function 'f', double 'atomicrmw' ('%[[#]]') moved 48 fp bytes + atomicrmw xchg ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !440 + ; CHECK: remark: test.c:41:9: in function 'f', double 'atomicrmw' ('%[[#]]') moved 48 fp bytes + ; CHECK: remark: test.c:41:9: in function 'f', double 'atomicrmw' ('%[[#]]') executed 6 flops + atomicrmw fadd ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !441 + ; CHECK: remark: test.c:42:9: in function 'f', double 'atomicrmw' ('%[[#]]') moved 48 fp bytes + ; CHECK: remark: test.c:42:9: in function 'f', double 'atomicrmw' ('%[[#]]') executed 6 flops + atomicrmw fsub ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !442 + ; CHECK: remark: test.c:43:9: in function 'f', double 'atomicrmw' ('%[[#]]') moved 48 fp bytes + ; CHECK: remark: test.c:43:9: in function 'f', double 'atomicrmw' ('%[[#]]') executed 6 flops + atomicrmw fmax ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !443 + ; CHECK: remark: test.c:44:9: in function 'f', double 'atomicrmw' ('%[[#]]') moved 48 fp bytes + ; CHECK: remark: test.c:44:9: in function 'f', double 'atomicrmw' ('%[[#]]') executed 6 flops + atomicrmw fmin ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !444 + ; atomicrmw that is not a floating point op. + atomicrmw add ptr addrspace(37) null, i32 10 seq_cst, !dbg !405 + + ; Check some flop intrinsics. 
+ ; + ; CHECK: remark: test.c:50:9: in function 'f', double 'llvm.amdgcn.rcp.f64' call ('%rcp') executed 6 flops + %rcp = call double @llvm.amdgcn.rcp.f64(double 0.000000e+00), !dbg !450 + ; CHECK: remark: test.c:51:9: in function 'f', double 'llvm.amdgcn.trig.preop.f64' call ('%trig.preop') executed 6 flops + %trig.preop = call double @llvm.amdgcn.trig.preop.f64(double 0.000000e+00, i32 0), !dbg !451 + ; CHECK: remark: test.c:52:9: in function 'f', double 'llvm.fma.f64' call ('%fma') executed 6 flops x 2 + %fma = call double @llvm.fma.f64(double 0.000000e+00, double 0.000000e+00, double 0.000000e+00), !dbg !452 + ; CHECK: remark: test.c:53:9: in function 'f', double 'llvm.fmuladd.f64' call ('%fmuladd') executed 6 flops x 2 + %fmuladd = call double @llvm.fmuladd.f64(double 0.000000e+00, double 0.000000e+00, double 0.000000e+00), !dbg !453 + ; CHECK: remark: test.c:54:9: in function 'f', double 'llvm.sin.f64' call ('%sin') executed 6 flops + %sin = call double @llvm.sin.f64(double 0.000000e+00), !dbg !454 + ; CHECK: remark: test.c:55:9: in function 'f', double 'llvm.sqrt.f64' call ('%sqrt') executed 6 flops x 17 + %sqrt = call double @llvm.sqrt.f64(double 0.000000e+00), !dbg !455 + ; Intrinsic that is not a floating point op. + %umax = call i32 @llvm.umax.i32(i32 0, i32 0), !dbg !405 + + ; Check floating point types besides double scalar. + ; + ; CHECK: remark: test.c:60:9: in function 'f', float 'fadd' ('%float') executed 6 flops + %float = fadd float 0.000000e+00, 0.000000e+00, !dbg !460 + ; CHECK: remark: test.c:61:9: in function 'f', float 'store' moved 24 fp bytes + store float 0.000000e+00, ptr null, align 8, !dbg !461 + ; CHECK: remark: test.c:62:9: in function 'f', half 'fadd' ('%half') executed 6 flops + %half = fadd half 0.000000e+00, 0.000000e+00, !dbg !462 + ; CHECK: remark: test.c:63:9: in function 'f', half 'store' moved 12 fp bytes + store half 0.000000e+00, ptr null, align 8, !dbg !463 + ; CHECK: remark: test.c:64:9: in function 'f', bfloat 'fadd' ('%bfloat') executed 6 flops + %bfloat = fadd bfloat 0.000000e+00, 0.000000e+00, !dbg !464 + ; CHECK: remark: test.c:65:9: in function 'f', bfloat 'store' moved 12 fp bytes + store bfloat 0.000000e+00, ptr null, align 8, !dbg !465 + ; CHECK: remark: test.c:66:9: in function 'f', fp128 'fadd' ('%fp128') executed 6 flops + %fp128 = fadd fp128 0xL0, 0xL0, !dbg !466 + ; CHECK: remark: test.c:67:9: in function 'f', fp128 'store' moved 96 fp bytes + store fp128 0xL0, ptr null, align 8, !dbg !467 + ; CHECK: remark: test.c:68:9: in function 'f', <2 x double> 'fadd' ('%vector') executed 6 flops + %vector = fadd <2 x double> , , !dbg !468 + ; CHECK: remark: test.c:69:9: in function 'f', <2 x double> 'store' moved 96 fp bytes + store <2 x double> , ptr null, align 8, !dbg !469 + + ; Check conversions. 
+ ; + ; CHECK: remark: test.c:70:9: in function 'f', double 'uitofp' ('%uitofp.64.64') executed 6 flops + %uitofp.64.64 = uitofp i64 0 to double, !dbg !470 + ; CHECK: remark: test.c:71:9: in function 'f', double 'sitofp' ('%sitofp.64.64') executed 6 flops + %sitofp.64.64 = sitofp i64 0 to double, !dbg !471 + ; CHECK: remark: test.c:72:9: in function 'f', double 'fptoui' ('%fptoui.64.64') executed 6 flops x 2 + %fptoui.64.64 = fptoui double 0.000000e+00 to i64, !dbg !472 + ; CHECK: remark: test.c:73:9: in function 'f', double 'fptosi' ('%fptosi.64.64') executed 6 flops x 2 + %fptosi.64.64 = fptosi double 0.000000e+00 to i64, !dbg !473 + %uitofp.32.64 = uitofp i32 0 to double, !dbg !405 + %sitofp.32.64 = sitofp i32 0 to double, !dbg !405 + %fptoui.64.32 = fptoui double 0.000000e+00 to i32, !dbg !405 + %fptosi.64.32 = fptosi double 0.000000e+00 to i32, !dbg !405 + %uitofp.64.32 = uitofp i64 0 to float, !dbg !405 + %sitofp.64.32 = sitofp i64 0 to float, !dbg !405 + %fptoui.32.64 = fptoui float 0.000000e+00 to i64, !dbg !405 + %fptosi.32.64 = fptosi float 0.000000e+00 to i64, !dbg !405 + %uitofp.32.32 = uitofp i32 0 to float, !dbg !405 + %sitofp.32.32 = sitofp i32 0 to float, !dbg !405 + %fptoui.32.32 = fptoui float 0.000000e+00 to i32, !dbg !405 + %fptosi.32.32 = fptosi float 0.000000e+00 to i32, !dbg !405 + %fptrunc.64.32 = fptrunc double 0.000000e+00 to float, !dbg !405 + %fpext.32.64 = fpext float 0.000000e+00 to double, !dbg !405 + %bitcast.double.i64 = bitcast double 0.000000e+00 to i64, !dbg !405 + %bitcast.i64.double = bitcast i64 0 to double, !dbg !405 + + br i1 false, label %.ret, label %.many, !prof !499, !dbg !405 +} +; CHECK: remark: test.c:4:0: in function 'f', ProfileFloatingPointOpCount = 324 +; CHECK: remark: test.c:4:0: in function 'f', ProfileFloatingPointBytesMoved = 576 + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 20.0.0git", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = !{} + +!100 = distinct !DISubprogram(name: "noFlopOrProf", scope: !2, file: !2, line: 1, type: !101, scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1, retainedNodes: !3) +!101 = !DISubroutineType(types: !3) +!103 = distinct !DILexicalBlock(scope: !104, file: !2, line: 1, column: 3) +!104 = distinct !DILexicalBlock(scope: !100, file: !2, line: 1, column: 3) +!105 = !DILocation(line: 1, column: 9, scope: !103) + +!200 = distinct !DISubprogram(name: "noFlop", scope: !2, file: !2, line: 2, type: !201, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1, retainedNodes: !3) +!201 = !DISubroutineType(types: !3) +!202 = !{!"function_entry_count", i64 5} +!203 = distinct !DILexicalBlock(scope: !204, file: !2, line: 2, column: 3) +!204 = distinct !DILexicalBlock(scope: !200, file: !2, line: 2, column: 3) +!205 = !DILocation(line: 2, column: 9, scope: !203) + +!300 = distinct !DISubprogram(name: "noProf", scope: !2, file: !2, line: 3, type: !301, scopeLine: 3, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1, retainedNodes: !3) +!301 = !DISubroutineType(types: !3) +!302 = distinct !DILexicalBlock(scope: !303, file: !2, line: 3, column: 3) +!303 = distinct 
!DILexicalBlock(scope: !300, file: !2, line: 3, column: 3) +!304 = !DILocation(line: 4, column: 9, scope: !302) +!305 = !DILocation(line: 5, column: 9, scope: !302) +!306 = !DILocation(line: 6, column: 9, scope: !302) + +!400 = distinct !DISubprogram(name: "f", scope: !2, file: !2, line: 4, type: !401, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1, retainedNodes: !3) +!401 = !DISubroutineType(types: !3) +!402 = !{!"function_entry_count", i64 2} +!403 = distinct !DILexicalBlock(scope: !404, file: !2, line: 6, column: 3) +!404 = distinct !DILexicalBlock(scope: !400, file: !2, line: 6, column: 3) +!405 = !DILocation(line: 999, column: 999, scope: !403) +!410 = !DILocation(line: 10, column: 9, scope: !403) +!420 = !DILocation(line: 20, column: 9, scope: !403) +!421 = !DILocation(line: 21, column: 9, scope: !403) +!430 = !DILocation(line: 30, column: 9, scope: !403) +!431 = !DILocation(line: 31, column: 9, scope: !403) +!432 = !DILocation(line: 32, column: 9, scope: !403) +!440 = !DILocation(line: 40, column: 9, scope: !403) +!441 = !DILocation(line: 41, column: 9, scope: !403) +!442 = !DILocation(line: 42, column: 9, scope: !403) +!443 = !DILocation(line: 43, column: 9, scope: !403) +!444 = !DILocation(line: 44, column: 9, scope: !403) +!450 = !DILocation(line: 50, column: 9, scope: !403) +!451 = !DILocation(line: 51, column: 9, scope: !403) +!452 = !DILocation(line: 52, column: 9, scope: !403) +!453 = !DILocation(line: 53, column: 9, scope: !403) +!454 = !DILocation(line: 54, column: 9, scope: !403) +!455 = !DILocation(line: 55, column: 9, scope: !403) +!460 = !DILocation(line: 60, column: 9, scope: !403) +!461 = !DILocation(line: 61, column: 9, scope: !403) +!462 = !DILocation(line: 62, column: 9, scope: !403) +!463 = !DILocation(line: 63, column: 9, scope: !403) +!464 = !DILocation(line: 64, column: 9, scope: !403) +!465 = !DILocation(line: 65, column: 9, scope: !403) +!466 = !DILocation(line: 66, column: 9, scope: !403) +!467 = !DILocation(line: 67, column: 9, scope: !403) +!468 = !DILocation(line: 68, column: 9, scope: !403) +!469 = !DILocation(line: 69, column: 9, scope: !403) +!470 = !DILocation(line: 70, column: 9, scope: !403) +!471 = !DILocation(line: 71, column: 9, scope: !403) +!472 = !DILocation(line: 72, column: 9, scope: !403) +!473 = !DILocation(line: 73, column: 9, scope: !403) +!499 = !{!"branch_weights", i32 1, i32 2} diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll index 049142732aa15..e40fdf1525069 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -10,7 +10,7 @@ ; CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca ('%[[#]]') for 'dyn_ptr' with static size of 8 bytes ; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca ('%[[#]]') for 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca ('%[[#]]') for 'a' with static size of 8 bytes -; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in flat address space +; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' accesses memory in flat address space ; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is 
'@__kmpc_target_init' ; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@f' ; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' @@ -32,10 +32,12 @@ ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FlatAddrspaceAccesses = 1 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ProfileFloatingPointOpCount = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ProfileFloatingPointBytesMoved = 0 ; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca ('%[[#]]') for 'dyn_ptr' with static size of 8 bytes -; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in flat address space -; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction ('%[[#]]') accesses memory in flat address space +; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]', 'store' accesses memory in flat address space +; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' ('%[[#]]') accesses memory in flat address space ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', omp_target_thread_limit = 256 @@ -55,6 +57,8 @@ ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FlatAddrspaceAccesses = 2 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ProfileFloatingPointOpCount = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ProfileFloatingPointBytesMoved = 0 ; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca ('%[[#]]') for 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca ('%[[#]]') for 'a' with static size of 8 bytes @@ -77,6 +81,8 @@ ; CHECK-NEXT: remark: test.c:3:0: in function 'g', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', Invokes = 0 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', FlatAddrspaceAccesses = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', ProfileFloatingPointOpCount = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', ProfileFloatingPointBytesMoved = 0 ; CHECK-NOT: {{.}} ; ModuleID = 'test-openmp-amdgcn-amd-amdhsa-gfx906.bc' diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll index bd46741b24e8c..f6b3b117ab12f 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll @@ -10,7 +10,7 @@ ; CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca ('%[[#]]') for 'dyn_ptr' with static size of 8 bytes ; CHECK-NEXT: remark: test.c:14:9: in artificial function 
'[[OFF_FUNC]]_debug__', alloca ('%[[#]]') for 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca ('%[[#]]') for 'a' with static size of 8 bytes -; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in flat address space +; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' accesses memory in flat address space ; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '@__kmpc_target_init' ; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@f' ; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' @@ -25,10 +25,12 @@ ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FlatAddrspaceAccesses = 1 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ProfileFloatingPointOpCount = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ProfileFloatingPointBytesMoved = 0 ; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca ('%[[#]]') for 'dyn_ptr' with static size of 8 bytes -; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in flat address space -; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction ('%[[#]]') accesses memory in flat address space +; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]', 'store' accesses memory in flat address space +; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' ('%[[#]]') accesses memory in flat address space ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', omp_target_thread_limit = 128 @@ -42,6 +44,8 @@ ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FlatAddrspaceAccesses = 2 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ProfileFloatingPointOpCount = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ProfileFloatingPointBytesMoved = 0 ; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca ('%[[#]]') for 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca ('%[[#]]') for 'a' with static size of 8 bytes @@ -57,6 +61,8 @@ ; CHECK-NEXT: remark: test.c:3:0: in function 'g', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', Invokes = 0 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', FlatAddrspaceAccesses = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', ProfileFloatingPointOpCount = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', ProfileFloatingPointBytesMoved 
= 0 ; CHECK-NOT: remark: {{.*: in function 'g',.*}} ; A lot of internal functions (e.g., __kmpc_target_init) come next, but we don't