From 8054de82dcdf13bba5739d561e5f269d99feb10b Mon Sep 17 00:00:00 2001
From: V Donaldson <vdonaldson@nvidia.com>
Date: Thu, 23 Jan 2025 10:09:28 -0800
Subject: [PATCH 1/2] [flang] IEEE underflow control for Arm

Update IEEE_SUPPORT_UNDERFLOW_CONTROL, IEEE_GET_UNDERFLOW_MODE, and
IEEE_SET_UNDERFLOW_MODE code for Arm.
---
 flang/include/flang/Tools/TargetSetup.h | 31 ++++++++++-----------
 flang/runtime/exceptions.cpp            | 36 ++++++++++++++++++++-----
 2 files changed, 45 insertions(+), 22 deletions(-)
diff --git a/flang/include/flang/Tools/TargetSetup.h b/flang/include/flang/Tools/TargetSetup.h
index d1b0da3a42c89..5d23df6823a94 100644
--- a/flang/include/flang/Tools/TargetSetup.h
+++ b/flang/include/flang/Tools/TargetSetup.h
@@ -24,34 +24,35 @@ namespace Fortran::tools {
     const std::string &compilerVersion, const std::string &compilerOptions) {
 
   const llvm::Triple &targetTriple{targetMachine.getTargetTriple()};
-  // FIXME: Handle real(3) ?
-  if (targetTriple.getArch() != llvm::Triple::ArchType::x86_64) {
-    targetCharacteristics.DisableType(
-        Fortran::common::TypeCategory::Real, /*kind=*/10);
-  }
+
+  targetCharacteristics.set_ieeeFeature(evaluate::IeeeFeature::Halting, true);
+
   if (targetTriple.getArch() == llvm::Triple::ArchType::x86_64) {
     targetCharacteristics.set_hasSubnormalFlushingControl(/*kind=*/3);
     targetCharacteristics.set_hasSubnormalFlushingControl(/*kind=*/4);
     targetCharacteristics.set_hasSubnormalFlushingControl(/*kind=*/8);
   }
+
   if (targetTriple.isARM() || targetTriple.isAArch64()) {
     targetCharacteristics.set_haltingSupportIsUnknownAtCompileTime();
     targetCharacteristics.set_ieeeFeature(
         evaluate::IeeeFeature::Halting, false);
-  } else {
-    targetCharacteristics.set_ieeeFeature(evaluate::IeeeFeature::Halting);
+    targetCharacteristics.set_hasSubnormalFlushingControl(/*kind=*/3);
+    targetCharacteristics.set_hasSubnormalFlushingControl(/*kind=*/4);
+    targetCharacteristics.set_hasSubnormalFlushingControl(/*kind=*/8);
+  }
+
+  if (targetTriple.getArch() != llvm::Triple::ArchType::x86_64) {
+    targetCharacteristics.DisableType(
+        Fortran::common::TypeCategory::Real, /*kind=*/10);
   }
 
-  // Figure out if we can support F128: see
-  // flang/runtime/Float128Math/math-entries.h
-  // TODO: this should be taken from TargetInfo::getLongDoubleFormat to support
-  // cross-compilation
+  // Check for kind=16 support. See flang/runtime/Float128Math/math-entries.h.
+  // TODO: Take this from TargetInfo::getLongDoubleFormat for cross compilation.
 #ifdef FLANG_RUNTIME_F128_MATH_LIB
-  // we can use libquadmath wrappers
-  constexpr bool f128Support = true;
+  constexpr bool f128Support = true; // use libquadmath wrappers
 #elif HAS_LDBL128
-  // we can use libm wrappers
-  constexpr bool f128Support = true;
+  constexpr bool f128Support = true; // use libm wrappers
 #else
   constexpr bool f128Support = false;
 #endif
diff --git a/flang/runtime/exceptions.cpp b/flang/runtime/exceptions.cpp
index f541b8e844ade..7fca0c431f8cd 100644
--- a/flang/runtime/exceptions.cpp
+++ b/flang/runtime/exceptions.cpp
@@ -11,7 +11,9 @@
 #include "flang/Runtime/exceptions.h"
 #include "terminator.h"
 #include <cfenv>
-#if __x86_64__
+#if __aarch64__
+#include <fpu_control.h>
+#elif __x86_64__
 #include <xmmintrin.h>
 #endif
 
@@ -90,20 +92,40 @@ bool RTNAME(SupportHalting)([[maybe_unused]] uint32_t except) {
 #endif
 }
 
+// A hardware FZ (flush to zero) bit is the negation of the
+// ieee_[get|set]_underflow_mode GRADUAL argument.
+#if defined(_MM_FLUSH_ZERO_MASK)
+// The MXCSR FZ bit affects computations of real kinds 3, 4, and 8.
+#elif defined(_FPU_GETCW)
+// The FPCR FZ bit affects computations of real kinds 3, 4, and 8.
+// bit 24: FZ   -- single, double precision flush to zero bit
+// bit 19: FZ16 -- half precision flush to zero bit [not currently relevant]
+#define _FPU_FPCR_FZ_MASK_ 0x01080000
+#endif
+
 bool RTNAME(GetUnderflowMode)(void) {
-#if _MM_FLUSH_ZERO_MASK
-  // The MXCSR Flush to Zero flag is the negation of the ieee_get_underflow_mode
-  // GRADUAL argument. It affects real computations of kinds 3, 4, and 8.
+#if defined(_MM_FLUSH_ZERO_MASK)
   return _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_OFF;
+#elif defined(_FPU_GETCW)
+  uint32_t fpcr;
+  _FPU_GETCW(fpcr);
+  return (fpcr & _FPU_FPCR_FZ_MASK_) != _FPU_FPCR_FZ_MASK_;
 #else
   return false;
 #endif
 }
 void RTNAME(SetUnderflowMode)(bool flag) {
-#if _MM_FLUSH_ZERO_MASK
-  // The MXCSR Flush to Zero flag is the negation of the ieee_set_underflow_mode
-  // GRADUAL argument. It affects real computations of kinds 3, 4, and 8.
+#if defined(_MM_FLUSH_ZERO_MASK)
   _MM_SET_FLUSH_ZERO_MODE(flag ? _MM_FLUSH_ZERO_OFF : _MM_FLUSH_ZERO_ON);
+#elif defined(_FPU_GETCW)
+  uint32_t fpcr;
+  _FPU_GETCW(fpcr);
+  if (flag) {
+    fpcr &= ~_FPU_FPCR_FZ_MASK_;
+  } else {
+    fpcr |= _FPU_FPCR_FZ_MASK_;
+  }
+  _FPU_SETCW(fpcr);
 #endif
 }
 

From d339c6538d84948555d28cb682b9c072a2ffc02a Mon Sep 17 00:00:00 2001
From: V Donaldson <vdonaldson@nvidia.com>
Date: Mon, 27 Jan 2025 07:14:33 -0800
Subject: [PATCH 2/2] [flang] arm build fix

---
 flang/runtime/exceptions.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flang/runtime/exceptions.cpp b/flang/runtime/exceptions.cpp
index 7fca0c431f8cd..4ea11aa6ee8fc 100644
--- a/flang/runtime/exceptions.cpp
+++ b/flang/runtime/exceptions.cpp
@@ -108,7 +108,7 @@ bool RTNAME(GetUnderflowMode)(void) {
   return _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_OFF;
 #elif defined(_FPU_GETCW)
   uint32_t fpcr;
-  _FPU_GETCW(fpcr);
+  __asm__ __volatile__("mrs    %w0, fpcr" : "=r"(fpcr));
   return (fpcr & _FPU_FPCR_FZ_MASK_) != _FPU_FPCR_FZ_MASK_;
 #else
   return false;
@@ -119,13 +119,13 @@ void RTNAME(SetUnderflowMode)(bool flag) {
   _MM_SET_FLUSH_ZERO_MODE(flag ? _MM_FLUSH_ZERO_OFF : _MM_FLUSH_ZERO_ON);
 #elif defined(_FPU_GETCW)
   uint32_t fpcr;
-  _FPU_GETCW(fpcr);
+  __asm__ __volatile__("mrs    %w0, fpcr" : "=r"(fpcr));
   if (flag) {
     fpcr &= ~_FPU_FPCR_FZ_MASK_;
   } else {
     fpcr |= _FPU_FPCR_FZ_MASK_;
   }
-  _FPU_SETCW(fpcr);
+  __asm__ __volatile__("msr    fpcr, %w0" : : "r"(fpcr));
 #endif
 }